{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "_32CgRZySCtS"},
   "outputs": [],
   "source": [
    "# Install Whisper from the OpenAI GitHub repository.\n",
    "# %pip (instead of !pip) installs into the environment of the running kernel.\n",
    "%pip install -q git+https://github.com/openai/whisper.git\n",
    "\n",
    "# Install FFmpeg, which Whisper uses to decode audio files. The -y flag answers\n",
    "# the confirmation prompt so the install cannot stall waiting for input.\n",
    "!sudo apt-get update && sudo apt-get install -y ffmpeg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "fXa6lxdyS8-d"},
   "outputs": [],
   "source": [
    "# Import libraries (standard library first, then third-party packages)\n",
    "\n",
    "import os\n",
    "\n",
    "import librosa\n",
    "import librosa.display\n",
    "import matplotlib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import whisper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {"id": "Fv2LBEm5THdF"},
   "source": ["## Load Audio Data from Google Drive"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"colab": {"base_uri": "https://localhost:8080/"}, "executionInfo": {"elapsed": 18031, "status": "ok", "timestamp": 1694613938704, "user": {"displayName": "Camille Landesvatter", "userId": "08465821237712601560"}, "user_tz": -120}, "id": "cFU4Z38YTHLI", "outputId": "16844764-71e4-4fc4-d595-c1479ef098d4"},
   "outputs": [],
   "source": [
    "# Mount the drive that contains wav files\n",
    "from google.colab import drive\n",
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"colab": {"base_uri": "https://localhost:8080/"}, "executionInfo": {"elapsed": 308, "status": "ok", "timestamp": 1694615035172, "user": {"displayName": "Camille Landesvatter", "userId": "08465821237712601560"}, "user_tz": -120}, "id": "DKHoCIaPTWLE", "outputId": "f0cd9c1e-1f60-422b-eafe-0eaf4c3f86c1"},
   "outputs": [],
   "source": [
    "# Directory that contains the WAV files (replace with the real path on the mounted drive).\n",
    "AUDIO_DIR = 'path-containing-wav-files'\n",
    "\n",
    "# Find all WAV files directly inside AUDIO_DIR (no recursion) using librosa's utility function.\n",
    "# Note: Files with a size less than 100KB (~ <2 seconds) were excluded in a previous step,\n",
    "# as librosa.load cannot resample them to 16000Hz.\n",
    "files = librosa.util.find_files(AUDIO_DIR, ext='wav', recurse=False)\n",
    "\n",
    "# Fail early with a clear message instead of an IndexError further down.\n",
    "assert files, f'No WAV files found in {AUDIO_DIR!r}'\n",
    "\n",
    "# Convert the list of file paths to a NumPy array.\n",
    "files = np.asarray(files)\n",
    "\n",
    "len(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"colab": {"base_uri": "https://localhost:8080/"}, "executionInfo": {"elapsed": 224, "status": "ok", "timestamp": 1694615037150, "user": {"displayName": "Camille Landesvatter", "userId": "08465821237712601560"}, "user_tz": -120}, "id": "o4NBqtel2GhQ", "outputId": "78ae3489-9480-4ec5-c382-280594d9bdf4"},
   "outputs": [],
   "source": [
    "# Load and resample all WAV files to 16,000 Hz using librosa.load.\n",
    "# Keep the explicit sampling rate: librosa.load otherwise resamples to its own\n",
    "# default of 22,050 Hz, while Whisper treats raw audio arrays as 16,000 Hz audio.\n",
    "TARGET_SR = 16000\n",
    "\n",
    "# librosa.load returns a (waveform, sampling_rate) tuple for each file.\n",
    "df = [librosa.load(path, sr=TARGET_SR) for path in files]\n",
    "\n",
    "# One row per file: 'value' holds the waveform, 'sr' the sampling rate.\n",
    "librosa_values = pd.DataFrame(df, columns=['value', 'sr'])\n",
    "\n",
    "# Extract the 'value' column from the DataFrame\n",
    "value = librosa_values.value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"colab": {"base_uri": "https://localhost:8080/"}, "executionInfo": {"elapsed": 67826, "status": "ok", "timestamp": 1694615118530, "user": {"displayName": "Camille Landesvatter", "userId": "08465821237712601560"}, "user_tz": -120}, "id": "0bHNyCkt2e_7", "outputId": "46230086-cc3c-4d87-8a64-38171898bd81"},
   "outputs": [],
   "source": [
    "# Load the Whisper model named \"large\" for transcription.\n",
    "# You can also consider using the \"medium\" model if needed.\n",
    "model = whisper.load_model(\"large\")\n",
    "\n",
    "# model.transcribe(audio) returns a dict whose \"text\" entry holds the transcript."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"colab": {"base_uri": "https://localhost:8080/"}, "executionInfo": {"elapsed": 5, "status": "ok", "timestamp": 1694615159943, "user": {"displayName": "Camille Landesvatter", "userId": "08465821237712601560"}, "user_tz": -120}, "id": "0J-Mrss-26zC", "outputId": "efa89aa3-86cf-404c-b491-fb0c73b15992"},
   "outputs": [],
   "source": [
    "# Perform transcription for each audio sample in 'value' using the loaded Whisper model.\n",
    "transcription = [model.transcribe(audio) for audio in value]\n",
    "\n",
    "# Collect the transcribed text of every sample. Iterating over 'transcription'\n",
    "# itself keeps this correct for any number of input files.\n",
    "strings = [result[\"text\"] for result in transcription]\n",
    "\n",
    "# Every input file must have produced exactly one transcript.\n",
    "assert len(strings) == len(files)\n",
    "\n",
    "# Show only the count: transcripts may contain personal information, so avoid\n",
    "# saving them in the notebook output.\n",
    "len(strings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "NHXY4QK96vWG"},
   "outputs": [],
   "source": [
    "# Collect the results in a 'data' DataFrame: 'file' identifies the recording and\n",
    "# 'transcription' holds its transcribed text ('strings' is in the same order as 'files').\n",
    "# To extend an existing per-recording DataFrame instead, assign\n",
    "# existing_df[\"transcription\"] = strings, provided its rows follow the order of 'files'.\n",
    "data = pd.DataFrame({\"file\": files, \"transcription\": strings})\n",
    "data.head()\n",
    "\n",
    "# To access the transcribed text for a specific sample, you can use:\n",
    "#transcription[0][\"text\"]"
   ]
  }
 ],
 "metadata": {"accelerator": "GPU", "colab": {"provenance": []}, "gpuClass": "standard", "kernelspec": {"display_name": "Python 3", "name": "python3"}, "language_info": {"name": "python"}},
 "nbformat": 4,
 "nbformat_minor": 0
}
